In this part of the exam, the questions are focused on the real_data_GDI dataset which is a part of the genderstat R package. To access the data you need to install the package. More information can be found on https://cran.r-project.org/web/packages/genderstat/index.html. Use the code below to access the data.
library(genderstat)
data("real_data_GDI")
names(real_data_GDI)
## [1] "country" "female_life_expectancy" "male_life_expectancy"
## [4] "female_mean_schooling" "male_mean_schooling" "female_gni_per_capita"
## [7] "male_gni_per_capita"
q1_1 <- length(unique(real_data_GDI$country))
paste(q1_1, "countries in the data")
## [1] "191 countries in the data"
variables <- names(real_data_GDI)
print("Missing Values by column")
## [1] "Missing Values by column"
q1_1.2 <- real_data_GDI %>%
summarise_all(list(function (x) sum(is.na(x))))
print(t(q1_1.2))
## [,1]
## country 0
## female_life_expectancy 0
## male_life_expectancy 0
## female_mean_schooling 12
## male_mean_schooling 12
## female_gni_per_capita 13
## male_gni_per_capita 13
q1_2.df <- real_data_GDI %>%
filter(complete.cases(.))
q1_2 <- unique(q1_2.df$country)
paste(length(q1_2), "countries in the data")
## [1] "174 countries in the data"
female.cols <- names(real_data_GDI)[startsWith(names(real_data_GDI), "female")]
male.cols <- names(real_data_GDI)[startsWith(names(real_data_GDI), "male")]
functions <- list(min=min, max=max)
q1_2.df %>%
select(all_of(male.cols)) %>%
summarise_all(list(min=min, max=max))
## male_life_expectancy_min male_mean_schooling_min male_gni_per_capita_min
## 1 50.4 2.2 797
## male_life_expectancy_max male_mean_schooling_max male_gni_per_capita_max
## 1 83.2 14.3 105348
q1_2.df %>%
select(all_of(female.cols)) %>%
summarise_all(list(min=min, max=max))
## female_life_expectancy_min female_mean_schooling_min
## 1 53.1 1.3
## female_gni_per_capita_min female_life_expectancy_max
## 1 176 88.3
## female_mean_schooling_max female_gni_per_capita_max
## 1 13.9 75094
sort.and.top <- function(df, index, col, top=5){
return (df %>%
select(all_of(c(index, col))) %>%
arrange_at(col, desc) %>%
head(n=top)
)
}
sort.and.top(q1_2.df, "country", "male_life_expectancy", 1)
## country male_life_expectancy
## 1 Australia 83.2
sort.and.top(q1_2.df, "country", "female_life_expectancy", 1)
## country female_life_expectancy
## 1 Hong Kong, China (SAR) 88.3
sort.and.top(q1_2.df, "country", "male_life_expectancy", 15)
## country male_life_expectancy
## 1 Australia 83.2
## 2 Hong Kong, China (SAR) 82.7
## 3 Switzerland 82.0
## 4 Japan 81.8
## 5 Norway 81.6
## 6 Malta 81.4
## 7 Iceland 81.2
## 8 Sweden 81.1
## 9 Canada 80.6
## 10 New Zealand 80.6
## 11 Singapore 80.6
## 12 Italy 80.5
## 13 Korea (Republic of) 80.4
## 14 Luxembourg 80.4
## 15 Ireland 80.2
sort.and.top(q1_2.df, "country", "female_life_expectancy", 15)
## country female_life_expectancy
## 1 Hong Kong, China (SAR) 88.3
## 2 Japan 87.7
## 3 Korea (Republic of) 86.8
## 4 Malta 86.1
## 5 Switzerland 85.9
## 6 Australia 85.8
## 7 Spain 85.8
## 8 France 85.5
## 9 Italy 85.1
## 10 Norway 84.9
## 11 Singapore 84.9
## 12 Sweden 84.9
## 13 Luxembourg 84.8
## 14 Canada 84.7
## 15 Finland 84.7
q1_6.df <- q1_2.df[q1_2.df$female_life_expectancy > 80 & q1_2.df$male_life_expectancy > 80, ] %>%
select(country)
print(q1_6.df)
## country
## 7 Australia
## 30 Canada
## 68 Hong Kong, China (SAR)
## 70 Iceland
## 75 Ireland
## 76 Israel
## 77 Italy
## 80 Japan
## 84 Korea (Republic of)
## 94 Luxembourg
## 100 Malta
## 113 New Zealand
## 118 Norway
## 141 Singapore
## 146 Spain
## 150 Sweden
## 151 Switzerland
In this question, we use the dataset that was created in Q1.2 (the dataset without the missing values).
Define a new categorical variable flife_cat in the following way: Re-code the variable female_life_expectancy into three categories:
female_life_expectancy <60: Low.
female_life_expectancy 60-80: Medium.
female_life_expectancy >80: High.
Count how many countries are included in each category.
categorize.life_exp <- function(row) {
if (row < 60) return("Low")
if (row <= 80) return("Medium")
else return("High")
}
q2_1.df <- q1_2.df %>%
mutate(flife_cat = factor(mapply(categorize.life_exp, female_life_expectancy), levels = c("Low", "Medium", "High"))) %>%
group_by(flife_cat) %>%
mutate(N = n()) %>%
ungroup()
q2_1.2.df <- q2_1.df %>%
select(flife_cat, N) %>%
distinct()
p1 <- ggplot(q2_1.2.df, aes(x="", y=N, fill=flife_cat, label=N)) +
geom_col(width=1, color="black") +
geom_text(position = position_stack(vjust=0.5)) +
coord_polar(theta="y", direction = 1) +
labs(fill="Category") +
theme_void()
p2 <- ggplot(q2_1.2.df, aes(x=flife_cat, y=N, fill=flife_cat, label=N)) +
geom_bar(stat="identity", color="black") +
geom_text(vjust=-0.25) +
labs(fill="Category") +
theme_void()
grid.arrange(p1, p2, ncol=2)
Figure 2.1
q2_3.df <- q2_1.df %>%
filter(flife_cat=="Low")
res <- q2_3.df %>%
select(country, male_life_expectancy) %>%
arrange(-male_life_expectancy) %>%
slice_head(n=3)
print(res)
## # A tibble: 3 × 2
## country male_life_expectancy
## <chr> <dbl>
## 1 Ivory Coast 57.4
## 2 South Sudan 53.4
## 3 Nigeria 52.3
q2_3.df %>%
summarise(mean_expectancy=mean(male_life_expectancy), sd_expectancy=sd(male_life_expectancy)) %>%
select(mean_expectancy, sd_expectancy)
## # A tibble: 1 × 2
## mean_expectancy sd_expectancy
## <dbl> <dbl>
## 1 52.6 2.56
In this question we use the real_data_GDI dataset without the missing values.
q3_1.dt <- q1_2.df %>%
filter(male_life_expectancy > 53)
paste(nrow(q3_1.dt))
## [1] "170"
data <- q3_1.dt$female_life_expectancy
n <- nrow(q3_1.dt)
u <- mean(data)
z <- qnorm(0.975)
ci <- c(lower=u - z * sd(data / sqrt(n)), upper=u + z * sd(data / sqrt(n)), mean=u)
ci
## lower upper mean
## 73.35136 75.59570 74.47353
q3_3.f <- function(x) {
x.test <- t.test(x, conf.level=0.95)
return(c(x.test$conf.int, x.test$estimate))
}
q3_3.f(q3_1.dt$female_life_expectancy)
## mean of x
## 73.34326 75.60380 74.47353
q3_4 <- interpretCI::meanCI(q3_1.dt$female_life_expectancy)$result
res <- c(lower=q3_4$lower, upper=q3_4$upper, mean=q3_4$m)
res
## lower upper mean
## 73.34326 75.60380 74.47353
In this question, we use the real_data_GDI dataset without the missing values.
ggplot(q2_1.df, aes(female_mean_schooling, female_life_expectancy, color=flife_cat)) +
geom_point() +
labs(color="Category") +
geom_text_repel(aes(label=ifelse(flife_cat == "Low", country, "")), color="black", vjust=0.2, size=2.5)
Figure 4.1
ggplot(q2_1.df, aes(female_mean_schooling, female_life_expectancy)) +
geom_point(aes(color=flife_cat, size=female_gni_per_capita, alpha=0.4)) +
labs(color="Category", size="gni per capita") +
guides(alpha="none")
Figure 4.2
cor <- cor.test(q2_1.df$female_mean_schooling, q2_1.df$female_life_expectancy)
cor$estimate
## cor
## 0.7812331
m4_4 <- lm(female_life_expectancy ~ female_mean_schooling, data=q2_1.df)
summary(m4_4)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.642943 1.0101189 58.05549 6.451493e-115
## female_mean_schooling 1.760213 0.1072436 16.41322 4.893158e-37
ggplot(q2_1.df, aes(female_mean_schooling, female_life_expectancy)) +
geom_point() +
geom_smooth(method=lm, se=FALSE, formula = y ~ x) +
theme_bw()
Figure 4.3
/newpage
For the analysis of this part we use the flying data which is a part of the R package dropout. This is a modified version of the Flying Etiquette Survey data. More information can be found in https://CRAN.R-project.org/package=dropout. The code below can be used to access the data
library(dropout)
data("flying")
names(flying)
## [1] "respondent_id" "travel_frequency"
## [3] "seat_recline" "height"
## [5] "children_under_18" "two_armrests"
## [7] "middle_armrest" "window_shade"
## [9] "moving_to_unsold_seat" "talking_to_seatmate"
## [11] "getting_up_on_6_hour_flight" "obligation_to_reclined_seat"
## [13] "recline_seat_rudeness" "eliminate_reclining_seats"
## [15] "switch_for_friends" "switch_for_family"
## [17] "wake_passenger_bathroom" "wake_passenger_walk"
## [19] "baby_on_plane" "unruly_children"
## [21] "electronics_violation" "smoking_violation"
## [23] "gender" "age"
## [25] "household_income" "education"
## [27] "location_census_region" "survey_type"
flying.df <- data.frame(flying[complete.cases(flying), ])
nrow(flying.df)
## [1] 677
flying.df %>%
select(age, gender) %>%
group_by(age, gender) %>%
summarize(n=n(), .groups="keep")
## # A tibble: 8 × 3
## # Groups: age, gender [8]
## age gender n
## <chr> <chr> <int>
## 1 18-29 Female 75
## 2 18-29 Male 62
## 3 30-44 Female 78
## 4 30-44 Male 95
## 5 45-60 Female 95
## 6 45-60 Male 108
## 7 > 60 Female 87
## 8 > 60 Male 77
ggplot(flying.df, aes(gender, height, fill=gender)) +
geom_boxplot() +
geom_jitter(size=0.1) +
labs(fill="Gender")
Figure 5.1
ggplot(flying.df, aes(age, fill=gender)) +
geom_bar(position="dodge")
Figure 5.2
degrees.levels <- c("Less than high school degree",
"High school degree",
"Some college or Associate degree",
"Bachelor degree",
"Graduate degree"
)
age.levels <- c(
"18-29",
"30-44",
"45-60",
"> 60"
)
flying.df <- flying.df %>%
mutate(
education = factor(education, levels=degrees.levels),
age = factor(age, levels=age.levels)
)
ggplot(flying.df, aes(x = age, fill = gender)) +
geom_bar(position = "dodge") +
facet_wrap(~education, nrow=1, labeller = label_wrap_gen()) +
theme_bw() +
labs(x = "Age") +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Figure 5.3
In this question, we use the flying data without the missing values.
q6_1.df <- flying.df %>%
select(gender, baby_on_plane) %>%
group_by(gender, baby_on_plane) %>%
summarize(n=n(), .groups = "drop_last") %>%
mutate(percentage = n / sum(n) * 100)
q6_1.df
## # A tibble: 6 × 4
## # Groups: gender [2]
## gender baby_on_plane n percentage
## <chr> <chr> <int> <dbl>
## 1 Female No, not at all rude 255 76.1
## 2 Female Yes, somewhat rude 58 17.3
## 3 Female Yes, very rude 22 6.57
## 4 Male No, not at all rude 214 62.6
## 5 Male Yes, somewhat rude 89 26.0
## 6 Male Yes, very rude 39 11.4
ggplot(q6_1.df, aes(gender, percentage, fill=baby_on_plane, label=sprintf("%.1f %%", percentage))) +
geom_bar(stat="identity") +
geom_text(position = position_stack(vjust=0.5)) +
theme_bw()
Figure 6.1
q6_3.df <- flying.df %>%
select(age, gender, baby_on_plane) %>%
group_by(age, gender, baby_on_plane) %>%
summarize(n=n(), .groups = "drop_last") %>%
mutate(percentage=n/sum(n) * 100)
q6_3.df
## # A tibble: 24 × 5
## # Groups: age, gender [8]
## age gender baby_on_plane n percentage
## <fct> <chr> <chr> <int> <dbl>
## 1 18-29 Female No, not at all rude 47 62.7
## 2 18-29 Female Yes, somewhat rude 23 30.7
## 3 18-29 Female Yes, very rude 5 6.67
## 4 18-29 Male No, not at all rude 35 56.5
## 5 18-29 Male Yes, somewhat rude 19 30.6
## 6 18-29 Male Yes, very rude 8 12.9
## 7 30-44 Female No, not at all rude 62 79.5
## 8 30-44 Female Yes, somewhat rude 11 14.1
## 9 30-44 Female Yes, very rude 5 6.41
## 10 30-44 Male No, not at all rude 50 52.6
## # ℹ 14 more rows
# https://ggplot2.tidyverse.org/reference/labellers.html
ggplot(q6_3.df, aes(percentage,y="", fill=baby_on_plane)) +
geom_col(position = "stack") +
coord_polar(theta="x", direction = 1) +
facet_grid(gender~age, labeller = label_both) +
geom_label_repel(aes(label=sprintf("%.2f %%", percentage)), size=2.5, nudge_y = 0.8, show.legend = F) +
theme(axis.text.x = element_blank(), axis.title.x.bottom = element_blank())
Figure 6.2
answer_levels = c(
"Yes, very rude",
"Yes, somewhat rude",
"No, not at all rude"
)
q6_5.df <- flying.df %>%
select(respondent_id,
switch_for_family,
talking_to_seatmate,
moving_to_unsold_seat,
baby_on_plane,
wake_passenger_walk,
) %>%
gather("activity", "Response", -respondent_id) %>%
group_by(activity, Response) %>%
summarise(n=n(), .groups = "drop_last") %>%
mutate(percentage=n/sum(n) * 100) %>%
mutate(Response = ifelse(Response == "No, not rude at all", "No, not at all rude", Response)) %>%
mutate(Response = factor(Response, levels=answer_levels))
q6_5.manual.plot <- ggplot(q6_5.df, aes(x=percentage, y=activity, fill=Response, )) +
geom_bar(stat="identity", width=0.5) +
theme_minimal() +
theme(legend.position="bottom") +
scale_fill_manual(values = c("#5ab4ac", "#e5e5e5", "#d8b365")) +
labs(x="Percentage", y="Behavior", fill="Response") +
geom_text(aes(label=ifelse(Response == "Yes, somewhat rude", sprintf("%.0f %%", percentage) , "")),
position = position_stack(vjust=0.5)) +
geom_vline(xintercept = 0)
answer_levels = c(
"No, not at all rude",
"Yes, somewhat rude",
"Yes, very rude"
)
q6_5likert.df <- flying.df %>%
select(
wake_passenger_walk,
baby_on_plane,
moving_to_unsold_seat,
talking_to_seatmate,
switch_for_family,
) %>%
mutate(moving_to_unsold_seat = ifelse(moving_to_unsold_seat == "No, not rude at all", "No, not at all rude", moving_to_unsold_seat)) %>%
mutate_all(function(x) factor(x, levels=answer_levels))
p <- likert(data.frame(q6_5likert.df))
a <- likert.bar.plot(p, legend.position = "bottom", text.size = 3,
order = F,
group.order = c(
"wake_passenger_walk",
"baby_on_plane",
"talking_to_seatmate",
"moving_to_unsold_seat",
"switch_for_family"
),
ordered = F)
plot(a)
Figure 6.3
In this question we focus on the flying data without missing values.
q7_1.table <- flying.df %>%
select(gender, baby_on_plane) %>%
table()
q7_1.table
## baby_on_plane
## gender No, not at all rude Yes, somewhat rude Yes, very rude
## Female 255 58 22
## Male 214 89 39
q7_2.test <- chisq.test(flying.df$gender, flying.df$baby_on_plane)
q7_2.test
##
## Pearson's Chi-squared test
##
## data: flying.df$gender and flying.df$baby_on_plane
## X-squared = 14.789, df = 2, p-value = 0.0006148
q7_3.test.stat <- q7_2.test$statistic
q7_3.test.df <- q7_2.test$parameter
q7_3.test.density <- dchisq(q7_3.test.stat, q7_3.test.df)
label = sprintf("Statistic = %.2f with density %.4f", q7_3.test.stat, q7_3.test.density)
ggplot() +
stat_function(fun=dchisq, args=list(df=q7_3.test.df), color="gray") +
xlim(q7_3.test.stat - q7_3.test.stat / 2, q7_3.test.stat + q7_3.test.stat / 2) +
geom_vline(xintercept = q7_3.test.stat, color="black") +
labs(x="Test statistic", y="Density") +
geom_text(aes(x=q7_3.test.stat, y=q7_3.test.density, label=label), hjust=-0, vjust=-1)
Figure 7.1
Prepare a presentation of 5-10 slides using R markdown about the connection between the gender and the variable baby_on_plane. Make sure that your presentation includes:
Please note that you WILL NOT be asked to give the presentation and you WILL NOT be asked questions about the presentation. Your aim in this question is to demonstrate that you know how to use R markdown to make a presentation about your analysis. More details how to make a presentation using R markdown: https://rmarkdown.rstudio.com/lesson-11.html.
/newpage # Part 3: the unemp data In this part of the exam, the questions are focused on the unemp dataset which is a part of the viridis R package. To access the data you need to install the package. More information can be found in https://cran.r-project.org/web/packages/viridis/viridis.pdf. Use the code below to access the data.
library(viridis)
data(unemp)
names(unemp)
## [1] "id" "state_fips" "county_fips" "name" "year"
## [6] "rate" "county" "state"
For the unemp dataset,
q9_1.n <- nrow(unemp)
q9_1.n
## [1] 3218
length(unique(unemp$state))
## [1] 52
q9_2.counties.ny <- unemp %>%
filter(state == "NY") %>%
select(county) %>%
unique()
nrow(q9_2.counties.ny)
## [1] 62
unemp_NY <- unemp %>%
filter(state == "NY")
unemp_NY %>%
select(rate) %>%
summarize(min_rate_NY = min(rate),
max_rate_NY = max(rate),
mean_rate_NY = mean(rate),
)
## min_rate_NY max_rate_NY mean_rate_NY
## 1 5.6 13.3 8.009677
Create a new data frame named sub_unemp, which includes data of 3 states: GA, TX and VA.
sub_unemp <- unemp %>%
filter(state %in% c("GA", "TX", "VA"))
nrow(sub_unemp)
## [1] 547
p1 <- ggplot(sub_unemp, aes(x=state, y=rate, color = state)) +
geom_boxplot() +
theme(legend.position = "bottom")
p2 <- ggplot(sub_unemp, aes(x=state, y=rate, color = state)) +
geom_violin() +
theme(legend.position = "bottom")
layout <- matrix(c(1, 1, 2, 2, NA, 3, 3, NA), nrow = 2, byrow = TRUE)
p3 <- ggplot(sub_unemp, aes(x=rate, fill=state)) +
geom_density(alpha=0.7) +
theme(legend.position = "bottom")
p <- grid.arrange(p1, p2, p3, nrow=2, layout_matrix=layout)
Figure 10.1
ggsave("q10_3.png", p)
q10_4.df <- sub_unemp %>%
filter(state %in% c("TX", "VA")) %>%
select(rate, state)
q10_4.test <- t.test(rate ~ state, data = q10_4.df, alternative = "two.sided")
q10_4.test
##
## Welch Two Sample t-test
##
## data: rate by state
## t = 0.89662, df = 274.16, p-value = 0.3707
## alternative hypothesis: true difference in means between group TX and group VA is not equal to 0
## 95 percent confidence interval:
## -0.2592371 0.6928721
## sample estimates:
## mean in group TX mean in group VA
## 7.936220 7.719403
q10_5.ci <- q10_4.test$conf.int
q10_5.ci
## [1] -0.2592371 0.6928721
## attr(,"conf.level")
## [1] 0.95
q10_6.test <- var.test(rate ~ state, data = q10_4.df, alternative = "two.sided")
q10_6.test
##
## F test to compare two variances
##
## data: rate by state
## F = 1.0293, num df = 253, denom df = 133, p-value = 0.8618
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.7587649 1.3768044
## sample estimates:
## ratio of variances
## 1.029254
# Manually
s1_square <- var(sub_unemp %>%
filter(state == "TX") %>%
select(rate))
s2_square <- var(sub_unemp %>%
filter(state == "VA") %>%
select(rate))
f.test <- max(s1_square, s2_square)/min(s1_square, s2_square)
df <- c(nrow(sub_unemp %>%
filter(state == "TX") %>%
select(rate)) - 1,
nrow(sub_unemp %>%
filter(state == "VA") %>%
select(rate)) - 1)
q10_6.test <- 2 * pf(f.test, df[1], df[2], lower.tail = FALSE)
q10_6.test
## [1] 0.86178
/newpage # Part 4: the pigs data In this part, the questions are focused on the pigs dataset which is a part of the emmeans R package. To access the data you need to install the package. More information can be found by help(pigs). You can use the code below to access the data.
library(emmeans)
data(pigs)
names(pigs)
## [1] "source" "percent" "conc"
In this question, we use the pigs dataset without the missing values.
levels = c("low", "moderate", "high")
pigs.df <- pigs[complete.cases(pigs), ]
q11_1.f <- function(x) {
if (x == 18) return ("high")
else if (x == 12 || x == 15) return ("moderate")
else if (x == 9) return ("low")
}
pigs.df <- pigs.df %>%
mutate(percent_class = factor(lapply(percent, q11_1.f), levels = levels))
q11_1.ratio <- nrow(pigs.df[pigs.df$percent_class == "high",])/nrow(pigs.df)
q11_1.ratio
## [1] 0.1724138
q11_1.df <- pigs.df %>%
group_by(source, percent_class) %>%
summarize(n = n(), .groups = "drop_last") %>%
mutate(percentage_by_source = 100 * n/sum(n))
ggplot(q11_1.df, aes("", percentage_by_source, fill=percent_class, label=
ifelse(source=="skim", sprintf("%.2f %%", percentage_by_source),
sprintf("%.0f %%", percentage_by_source)))) +
geom_col() +
coord_polar(theta = "y", direction = 1) +
theme_grey() +
scale_fill_grey() +
geom_label_repel(size = 4, show.legend = F, position = position_stack(vjust=0.5), color="white") +
facet_wrap(~source, labeller = label_both) +
theme(axis.text.x = element_blank(),
axis.title.x.bottom = element_blank(),
axis.title.y.left = element_blank(),
panel.grid = element_blank()
)
Figure 11.1
pigs.df %>%
group_by(percent_class) %>%
summarize(count = n(), mean = mean(conc), sd = sd(conc))
## # A tibble: 3 × 4
## percent_class count mean sd
## <fct> <int> <dbl> <dbl>
## 1 low 8 32.7 5.74
## 2 moderate 16 38.9 7.81
## 3 high 5 39.9 12.1
q11_3.model <- aov(conc ~ percent_class, data = pigs.df)
summary(q11_3.model)
## Df Sum Sq Mean Sq F value Pr(>F)
## percent_class 2 246.8 123.4 1.858 0.176
## Residuals 26 1726.4 66.4
q11_5.f <- summary(q11_3.model)[[1]][["F value"]][[1]]
q11_5.f
## [1] 1.858447
ggplot(q11_3.model, aes(sample = residuals(q11_3.model))) +
stat_qq() +
stat_qq_line() +
labs(x = "Theoretical Quantiles", y = "Sample Quantiles")
Figure 11.2
ggplot(q11_3.model, aes(x = residuals(q11_3.model))) +
geom_histogram(bins = 10, color = "black", fill = "lightblue") +
labs(x = "Residuals", y = "Frequency")
Figure 11.3
/newpage # Part 5: the fish data
In this part we use the data fish which is a part of the rrcov R package. To access the data you need to install the package. More information can be found in https://search.r-project.org/CRAN/refmans/rrcov/html/fish.html. You can use the code below to access the data.
library(rrcov)
data(fish)
names(fish)
## [1] "Weight" "Length1" "Length2" "Length3" "Height" "Width" "Species"
In this question we use the fish dataset WITH the missing values.
q12_1.df <- fish %>%
group_by(Species) %>%
summarize(n = n())
q12_1.df
## # A tibble: 7 × 2
## Species n
## <int> <int>
## 1 1 35
## 2 2 6
## 3 3 20
## 4 4 11
## 5 5 14
## 6 6 17
## 7 7 56
fish2 <- fish[-14, ] # Or better fish2 <- fish[!is.na(fish$Weight), ]
ggplot(fish2, aes(x = Species, y = Weight, fill = as.factor(Species))) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("lightblue", "orange", "green", "blue", "red", "yellow", "pink")) +
labs(x = "Species", y = "Weight")
Figure 12.1
ggplot(fish2, aes(x = Weight, y = Width)) +
geom_point(size = 1) +
facet_wrap(~Species, ncol = 3, scales = "free")
Figure 12.2
For the new dataset defined in Q12.2.
# ini
q13.1.vector <- data.frame(Species = character(), Correlation = numeric())
for (i in unique(fish2$Species)) {
# append a new record to the dataset
cor.i <- cor(fish2$Weight[fish2$Species == i], fish2$Width[fish2$Species == i])
r.i <- data.frame(Species = i, Correlation = cor.i)
q13.1.vector <- rbind(q13.1.vector, r.i)
}
q13.1.vector
## Species Correlation
## 1 1 0.34444076
## 2 2 0.21565048
## 3 3 0.44890097
## 4 4 0.63814822
## 5 5 0.64815801
## 6 6 0.04297192
## 7 7 0.56262710
q13.2.vector <- fish2 %>%
group_by(Species) %>%
summarize(Correlation = cor(Weight, Width)) %>%
mutate(Correlation = round(Correlation, 3))
q13.2.vector
## # A tibble: 7 × 2
## Species Correlation
## <int> <dbl>
## 1 1 0.344
## 2 2 0.216
## 3 3 0.449
## 4 4 0.638
## 5 5 0.648
## 6 6 0.043
## 7 7 0.563
q14.1.df <- fish2 %>%
group_by(Species) %>%
summarize(
avg_w = round(mean(Weight), 0),
avg_L1 = round(mean(Length1), 1),
avg_h = round(mean(Height), 1)
)
q14.1.df
## # A tibble: 7 × 4
## Species avg_w avg_L1 avg_h
## <int> <dbl> <dbl> <dbl>
## 1 1 626 30.3 39.6
## 2 2 531 28.8 29.2
## 3 3 152 20.6 26.7
## 4 4 155 18.7 39.3
## 5 5 11 11.3 16.9
## 6 6 719 42.5 15.8
## 7 7 382 25.7 26.3
write.xlsx(q14.1.df, "q14.1.xlsx")
/newpage # Part 6: the msleep data
In this part we use the data msleep which is a part of the ggplot2 R package. To access the data you need to install the package. More information can be found in https://github.com/tidyverse/ggplot2/blob/main/data-raw/msleep.csv. You can use the code below to access the data.
library(dplyr)
data("msleep", package = "ggplot2")
head(msleep, 5)
## # A tibble: 5 × 11
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cheetah Acin… carni Carn… lc 12.1 NA NA 11.9
## 2 Owl mo… Aotus omni Prim… <NA> 17 1.8 NA 7
## 3 Mounta… Aplo… herbi Rode… nt 14.4 2.4 NA 9.6
## 4 Greate… Blar… omni Sori… lc 14.9 2.3 0.133 9.1
## 5 Cow Bos herbi Arti… domesticated 4 0.7 0.667 20
## # ℹ 2 more variables: brainwt <dbl>, bodywt <dbl>
nrow(msleep)
## [1] 83
q15.2.df <- msleep %>%
group_by(order) %>%
summarize(avg_sleep = round(mean(sleep_total), 2)) %>%
arrange(desc(avg_sleep))
q15.2.df
## # A tibble: 19 × 2
## order avg_sleep
## <chr> <dbl>
## 1 Chiroptera 19.8
## 2 Didelphimorphia 18.7
## 3 Cingulata 17.8
## 4 Afrosoricida 15.6
## 5 Pilosa 14.4
## 6 Rodentia 12.5
## 7 Diprotodontia 12.4
## 8 Soricomorpha 11.1
## 9 Primates 10.5
## 10 Erinaceomorpha 10.2
## 11 Carnivora 10.1
## 12 Scandentia 8.9
## 13 Monotremata 8.6
## 14 Lagomorpha 8.4
## 15 Hyracoidea 5.67
## 16 Artiodactyla 4.52
## 17 Cetacea 4.5
## 18 Proboscidea 3.6
## 19 Perissodactyla 3.47
q16_1.df <- msleep %>%
filter(!is.na(bodywt) & !is.na(brainwt)) %>%
mutate(pred = lm(brainwt ~ bodywt)$fitted.values)
ggplot(q16_1.df, aes(x = bodywt, color = bodywt)) +
geom_point(aes(y = brainwt)) +
geom_line(aes(y = pred)) +
labs(x = "Body Weight", y = "Brain weight") +
labs(title = "Scatter Plot of Body Weight and Brain Weight", color="Body Weight") +
theme_bw()
Figure 16.1
q16_2.df <- q16_1.df %>%
filter(bodywt > 2000)
ggplot(q16_1.df, aes(x = bodywt, color = bodywt)) +
geom_point(aes(y = brainwt)) +
geom_line(aes(y = pred)) +
labs(x = "Body weight", y = "Brain weight") +
labs(title = "Scatter Plot of Body Weight and Brain Weight", color="Body Weight") +
geom_text(data = q16_2.df, aes(x = bodywt, y=brainwt, label = bodywt), hjust = 0.5, vjust = -0.5) +
theme_bw()
Figure 16.2
q16_3.df <- msleep %>%
filter(!is.na(bodywt) & !is.na(brainwt)) %>%
filter(bodywt < 2000) %>%
mutate(pred = lm(brainwt ~ bodywt)$fitted.values)
ggplot(q16_3.df, aes(x = bodywt, )) +
geom_point(aes(y = brainwt, color = bodywt)) +
geom_smooth(aes(y = brainwt), method = "lm", se = T, formula = y ~ x, linewidth=0) +
geom_line(aes(y = pred, color = bodywt)) +
labs(x = "Body weight", y = "Brain weight") +
labs(title = "Scatter Plot of Body Weight and Brain Weight", color="Body Weight") +
xlim(0, 1000) +
theme_bw()
Figure 16.3
/newpage # Part 7: the ChickWeight data
In this part we use the ChickWeight data which is a part of the R datasets. To access the data you need to install the package. More information can be found in https://www.rdocumentation.org/packages/datasets/versions/3.6.2/topics/ChickWeight. You can use the code below to access the data.
data(ChickWeight)
head(ChickWeight)
## Grouped Data: weight ~ Time | Chick
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
## 6 93 10 1 1
Write a function that receives a dataset and a variable as an input and output returns the mean, median, and standard deviation of the variable rounded to 2 decimal places. Apply this function to the ChickWeight data and the variable weight.
In the output below, both numerical and graphical output were
produced using the user function my.analysis(). The function
receives as an input: (1) a dataset name, (2) the column number of
variable 1 (a numerical variable) and (3) the column number of variable
2 (a factor). Note that both variable 1 and variable 2 are a part of the
dataset. For the analysis in this question we use the
ChickWeight dataset at time 0 (so only observations at time 0
are included). The output was produce using the following code:
my.analysis(ChickWeight0,1,4).
The dataset ChickWeight0 contains the observations that were
measured at time 0. Based on the output below, your task in the question
is to write the function my.analysis() and to produce the
output using the code above. Note that your function should produce an
identical output.
q17_1.f <- function(df, variable) {
mean <- round(mean(df[[variable]], na.rm = T), 2)
median <- round(median(df[[variable]], na.rm = T), 2)
sd <- round(sd(df[[variable]], na.rm = T), 2)
return(c(mean, median, sd))
}
q17_1.f(ChickWeight, "weight")
## [1] 121.82 103.00 71.07
my.analysis <- function(df, variable1, variable2) {
var1.name <- names(df)[variable1]
var2.name <- names(df)[variable2]
temp.df <- df %>%
mutate_at(var2.name, as.factor) %>%
mutate_at(var1.name, as.numeric) %>%
group_by_at(var2.name) %>%
summarise(mean = mean(.data[[var1.name]], na.rm = T),
SD = sd(.data[[var1.name]], na.rm = T),
n = n())
print("Summary statistics")
print(temp.df)
var2 <- var2.name
model <- aov(df[, var1.name] ~ df[, var2])
print("ANOVA table")
print(summary(model))
print("Sample mean and 95% CI")
temp.df <- temp.df %>%
mutate(lb = mean - 1.96 * SD / sqrt(n),
ub = mean + 1.96 * SD / sqrt(n))
p <- ggplot(temp.df, aes(x = factor(Diet), y = mean)) +
geom_point(color = "blue", size = 3) +
geom_linerange(aes(ymin = lb, ymax = ub), color = "blue") +
labs(y = "Weight", x = "Diet", title = "Sample means and 95% CI") +
theme_bw()
print(p)
print("Plot of the Residuals")
p <- ggplot() +
geom_point(aes(x=model$model[, 2], y =residuals(model)), color = "blue", shape=23, fill="blue", size=2) +
labs(x=var2.name, y="Residuals")
print(p)
}
ChickWeight0 <- ChickWeight %>%
filter(Time == 0)
my.analysis(ChickWeight0, 1, 4)
## [1] "Summary statistics"
## # A tibble: 4 × 4
## Diet mean SD n
## <fct> <dbl> <dbl> <int>
## 1 1 41.4 0.995 20
## 2 2 40.7 1.49 10
## 3 3 40.8 1.03 10
## 4 4 41 1.05 10
## [1] "ANOVA table"
## Df Sum Sq Mean Sq F value Pr(>F)
## df[, var2] 3 4.32 1.440 1.132 0.346
## Residuals 46 58.50 1.272
## [1] "Sample mean and 95% CI"
## [1] "Plot of the Residuals"
Create the boxplot shown in Figure 18.1 to visualize the distribution of weights for each time point in the ChickWeight dataset. Color the boxplot based on the Time variable.
Create an interactive boxplot, shown in Figure 18.2, to visualize the distribution of weights for each time point in the ChickWeight dataset. Color the boxplot based on the Time variable. DO NOT include this figure in the PDF document for your answers but ONLY in the HTML document.
q18_1.plot <- ggplot(ChickWeight, aes(x = as.factor(Time), y = weight)) +
geom_boxplot(aes(fill = factor(Time)), outlier.shape = NA) +
labs(title = "Boxplot of Weight by Time") +
theme_bw()
print(q18_1.plot)
Figure 18.1
ggplotly(q18_1.plot)
Figure 18.2
/newpage # Part 8: the Quakes data
In this part we use the data quakes which is a part of the R datasets collection. Use help() to get more information about the data. More information can be found in https://www.rdocumentation.org/packages/datasets/versions/3.6.2/topics/quakes. You can use the code below to access the data.
library(datasets)
data("quakes")
head(quakes)
## lat long depth mag stations
## 1 -20.42 181.62 562 4.8 41
## 2 -20.62 181.03 650 4.2 15
## 3 -26.00 184.10 42 5.4 43
## 4 -17.97 181.66 626 4.1 19
## 5 -20.42 181.96 649 4.0 11
## 6 -19.68 184.31 195 4.0 12
lat),
longitude (long), and depth (depth) of
earthquakes in the quakes dataset. DO NOT
include this figure in the PDF document for your answers but ONLY in the
HTML document.scatter3D(quakes$long, quakes$lat, quakes$depth, colvar = quakes$mag, pch = 16, ticktype = "detailed",
col = viridis(n=100), size = 0.5, bty = "u", col.axis = "gray", col.grid = "gray",
xlab = "\nX", ylab = "\nY", zlab = "\nZ", cex.axis = 0.6)
Figure 19.1
plot_ly(quakes, x = ~lat, y = ~long, z = ~depth, type="scatter3d", size=I(100), mode="markers", color=~mag)
Figure 19.2
quakes %>%
group_by(stations) %>%
summarise(mean_mag = mean(mag))
## # A tibble: 102 × 2
## stations mean_mag
## <int> <dbl>
## 1 10 4.23
## 2 11 4.23
## 3 12 4.20
## 4 13 4.33
## 5 14 4.28
## 6 15 4.28
## 7 16 4.27
## 8 17 4.35
## 9 18 4.44
## 10 19 4.38
## # ℹ 92 more rows
highest_25 <- quakes %>%
group_by(stations) %>%
summarise(max_mag = max(mag)) %>%
arrange(desc(max_mag)) %>%
head(25)
q20_2.df <- quakes %>%
filter(stations %in% highest_25$stations) %>%
arrange(desc(mag))
nrow(q20_2.df)
## [1] 52
q20_3.df <- q20_2.df %>%
mutate(ratio = mag / depth) %>%
arrange(desc(ratio))
q20_4.df <- q20_3.df %>%
group_by(stations) %>%
summarise(mean_ratio = mean(ratio)) %>%
arrange(desc(mean_ratio)) %>%
head(3)
print(q20_4.df$stations)
## [1] 76 83 123
q20_5.df <- q20_3.df %>%
filter(ratio > 0.099)
ggplot(q20_5.df, aes(x = ratio, y = depth)) +
geom_point() +
labs(title = "Boxplot of Ratio by Station") +
theme_bw()
Figure 20.1
q20_6.df <- q20_3.df %>%
mutate(mat_cat = ifelse(mag > mean(mag), 1, 0))
ggplot(q20_6.df, aes(x = ratio, y = depth, color = as.factor(stations))) +
geom_point() +
labs(title = "Boxplot of Ratio by Station") +
facet_wrap(~mat_cat, labeller = label_both) +
theme_bw()
Figure 20.2